import datetime

import requests
from bs4 import BeautifulSoup, element
import os
import time
from urllib.parse import urljoin

from holoviews.operation import element

# Scraper configuration
BASE_URL = 'https://www.runoob.com/java/java-tutorial.html'  # entry page to crawl
OUTPUT_DIR = 'java_modifiers_data'  # all output .txt files are written here
HEADERS = {
    # Desktop Chrome User-Agent so the site serves the regular page
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


def init_directory():
    """Create the output directory if it does not already exist.

    Uses ``exist_ok=True`` instead of the check-then-create pattern,
    which avoids the race between ``os.path.exists`` and ``os.makedirs``.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)


def get_page_content(url):
    """Fetch a page and return its body text, or None on any request error.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The page body as a string, or None if the request failed
        (connection error, timeout, or non-2xx status code).
    """
    try:
        # Bug fix: the original had no timeout, so a dead host could
        # hang the whole scraper forever.
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        # Force UTF-8: the target pages are UTF-8, but requests may
        # mis-detect the charset from the response headers.
        response.encoding = 'utf-8'
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def parse_main_page(html):
    """Parse the main tutorial page into sections and sub-page links.

    Args:
        html: Raw HTML of the main page.

    Returns:
        dict with keys:
            'title'     - text of the page's <h1> ('' if absent)
            'sections'  - list of {'title', 'content'} dicts; content is a
                          list of ('text'|'code', str) tuples in page order
            'sub_links' - de-duplicated absolute URLs found in the article

    Raises:
        ValueError: if the 'article-body' container is missing. The
            original code crashed here with an opaque AttributeError;
            main() cannot proceed without this page, so fail loudly.
    """
    soup = BeautifulSoup(html, 'lxml')
    main_content = soup.find('div', class_='article-body')
    if main_content is None:
        raise ValueError("main page is missing the 'article-body' container")

    # Guard: soup.find('h1') returns None when the tag is absent.
    heading = soup.find('h1')
    title = heading.text.strip() if heading else ''
    print(f"正在解析主页面: {title}")

    # Walk headings, paragraphs and code blocks in document order,
    # grouping body content under the most recent h2/h3 heading.
    sections = []
    current_section = None
    for node in main_content.find_all(['h2', 'h3', 'p', 'pre']):
        if node.name in ['h2', 'h3']:
            current_section = {
                'title': node.text.strip(),
                'content': []
            }
            sections.append(current_section)
        elif current_section is not None:
            kind = 'code' if node.name == 'pre' else 'text'
            current_section['content'].append((kind, node.text.strip()))
        else:
            # Content that appears before the first heading is skipped.
            print(f"警告：发现标题前的内容 -> {node.text[:30]}...")

    # Collect absolute sub-page URLs, excluding the page itself and the
    # modifier-types page.
    sub_links = []
    for link in main_content.find_all('a', href=True):
        full_url = urljoin(BASE_URL, link['href'])
        if full_url != BASE_URL and 'java-modifier-types' not in full_url:
            sub_links.append(full_url)

    return {
        'title': title,
        'sections': sections,
        'sub_links': list(set(sub_links))  # de-duplicate
    }


def parse_sub_page(html):
    """Parse a sub page into a flat list of typed content chunks.

    Args:
        html: Raw HTML of the sub page.

    Returns:
        dict with 'title' and 'content', where content is a list of
        (kind, payload) tuples with kind in {'header', 'code', 'table',
        'text'}; a table payload is a list of row-cell lists. Returns
        None when the page has no 'article-body' container.
    """
    soup = BeautifulSoup(html, 'lxml')
    main_content = soup.find('div', class_='article-body')

    if not main_content:
        return None

    # Guard: pages without an <h1> crashed the original with
    # AttributeError on None.
    heading = soup.find('h1')
    title = heading.text.strip() if heading else ''
    print(f"正在解析子页面: {title}")

    content = []
    # Renamed the loop variable: the original used 'element', which
    # shadowed the module-level 'element' import.
    for node in main_content.find_all(['h2', 'h3', 'p', 'pre', 'table']):
        if node.name in ['h2', 'h3']:
            content.append(('header', node.text.strip()))
        elif node.name == 'pre':
            content.append(('code', node.text.strip()))
        elif node.name == 'table':
            rows = [
                [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
                for tr in node.find_all('tr')
            ]
            content.append(('table', rows))
        else:
            content.append(('text', node.text.strip()))

    return {
        'title': title,
        'content': content
    }


def save_data(data, filename):
    """Write parsed page data to OUTPUT_DIR/filename as readable text.

    Args:
        data: Either the dict produced by parse_main_page (has a
            'sections' key), the dict from parse_sub_page (has a
            'content' key), or a plain string written verbatim.
        filename: Target file name inside OUTPUT_DIR.
    """
    path = os.path.join(OUTPUT_DIR, filename)
    with open(path, 'w', encoding='utf-8') as f:
        if isinstance(data, dict):
            f.write(f"标题: {data.get('title', '')}\n\n")
            if 'sections' in data:  # main-page layout
                for section in data['sections']:
                    f.write(f"## {section['title']} ##\n")
                    for content_type, text in section['content']:
                        if content_type == 'code':
                            f.write(f"\n代码示例:\n{text}\n")
                        else:
                            f.write(f"{text}\n")
                    f.write("\n")
            else:  # sub-page layout
                for content_type, content in data['content']:
                    if content_type == 'header':
                        f.write(f"\n### {content} ###\n")
                    elif content_type == 'code':
                        f.write(f"\n代码示例:\n{content}\n")
                    elif content_type == 'table':
                        f.write("\n表格数据:\n")
                        for row in content:
                            f.write(" | ".join(row) + "\n")
                    else:
                        f.write(f"{content}\n")
        else:
            f.write(data)
    # Bug fix: the original printed the literal placeholder "(unknown)"
    # instead of the path that was actually written.
    print(f"已保存文件: {path}")

def generate_filename():
    """Return a timestamped .txt file name, e.g. '20240101_120000.txt'."""
    return datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + ".txt"
def main():
    """Crawl the tutorial: save the main page, then every linked sub page."""
    init_directory()

    # Main page first -- without it there are no sub links to follow.
    main_html = get_page_content(BASE_URL)
    if not main_html:
        return

    overview = parse_main_page(main_html)
    save_data(overview, generate_filename())

    # Visit each discovered sub page with a polite delay between requests.
    for number, url in enumerate(overview['sub_links'], start=1):
        time.sleep(1)  # throttle to avoid hammering the server
        page_html = get_page_content(url)
        if not page_html:
            continue

        parsed = parse_sub_page(page_html)
        if parsed is None:
            continue
        save_data(parsed, f"{number:02d}_{parsed['title']}.txt")


if __name__ == '__main__':
    main()